In [1]:

    
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext autotime



In [2]:

    
from baselines import remove_na, tidy_labels, map_aggression_score_to_2class
import pandas as pd









    



time: 1.03 s

Clean Raw Annotations

Load raw annotations



In [3]:

    
"""
# v4_annotated
user_blocked = [
                'annotated_onion_layer_5_rows_0_to_5000_raters_20',     
                'annotated_onion_layer_5_rows_0_to_10000',             
                'annotated_onion_layer_5_rows_0_to_10000_raters_3',          
                'annotated_onion_layer_5_rows_10000_to_50526_raters_10',
                'annotated_onion_layer_10_rows_0_to_1000',              
                'annotated_onion_layer_20_rows_0_to_1000',              
                'annotated_onion_layer_30_rows_0_to_1000',              
]

user_random = [
            'annotated_random_data_rows_0_to_5000_raters_20',
            'annotated_random_data_rows_5000_to_10000',
            'annotated_random_data_rows_5000_to_10000_raters_3',
            'annotated_random_data_rows_10000_to_20000_raters_10',
]

article_blocked = ['article_onion_layer_5_all_rows_raters_10',]
article_random = ['article_random_data_all_rows_raters_10',]
"""

user_blocked = [
            'user_blocked',
            'user_blocked_2',
            'user_blocked_3',
            'user_blocked_4',
            'user_blocked_layer_10',
            'user_blocked_layer_20',
            'user_blocked_layer_30',
]

user_random = [
            'user_random',
            'user_random_2',
            'user_random_3',
            'user_random_4',
            'user_random_extra_baselines',

]

article_blocked = [ 'article_blocked',
                    'article_blocked_layer_5_extra_baselines' ]


article_random = ['article_random',
                  'article_random_extra_baselines']



files = {
    'user': {'blocked': user_blocked, 'random': user_random},
    'article': {'blocked': article_blocked, 'random': article_random}
}


dfs = []

for ns, d in files.items():
    for sample, files in  d.items():
        for f in files:
            df = pd.read_csv('../../data/annotations/raw/%s/%s.csv' % (ns,f))
            df['src'] = f
            df['ns'] = ns
            df['sample'] = sample
            dfs.append(df)
df = pd.concat(dfs)
print('# annotations: ', df.shape[0])









    



# annotations:  1524236
time: 23.1 s

Make random and blocked samples disjoint



In [4]:

    
df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts()









    Out[4]:





1    124631
2        93
Name: rev_id, dtype: int64






    



time: 1.52 s



In [5]:

    
df.index = df.rev_id
df.sample_count = df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts()









    



time: 251 ms



In [6]:

    
df.sample_count.value_counts()









    Out[6]:





1    124631
2        93
Name: rev_id, dtype: int64






    



time: 3.94 ms



In [7]:

    
# just set them all to random
df['sample'][df.sample_count == 2] = 'random'









    



time: 107 ms






    



/Users/ellerywulczyn/miniconda3/lib/python3.5/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app



In [8]:

    
df.drop_duplicates(subset=['rev_id', 'sample'])['rev_id'].value_counts().value_counts()









    Out[8]:





1    124724
Name: rev_id, dtype: int64






    



time: 251 ms



In [9]:

    
del df.sample_count









    



time: 2.27 ms



In [10]:

    
print('# annotations: ', df.shape[0])









    



# annotations:  1524236
time: 1.07 ms

Tidy is_harassment_or_attack column



In [11]:

    
df = tidy_labels(df)









    



time: 5.93 s

Remap aggression score



In [12]:

    
df['aggression'] = df['aggression_score'].apply(map_aggression_score_to_2class)









    



time: 554 ms

Remove answers to test questions



In [13]:

    
df = df.query('_golden == False')
print('# annotations: ', df.shape[0])









    



# annotations:  1524236
time: 862 ms

Remove annotations where revision could not be read



In [14]:

    
# remove all annotations for a revisions where more than 50% of annotators for that revision could not read the comment
df = remove_na(df)
print('# annotations: ', df.shape[0])









    



# annotations:  1510976
time: 46.9 s



In [15]:

    
# remove all annotations where the annotator could not read the comment
df = df.query('na==False')
print('# annotations: ', df.shape[0])









    



# annotations:  1501494
time: 514 ms

Examine aggression_score or is_harassment_or_attack input



In [16]:

    
df['aggression_score'].value_counts(dropna=False)









    Out[16]:





 0.0    1081861
-1.0     144269
 1.0      92441
-3.0      74081
-2.0      66210
 2.0      29848
 3.0      11902
NaN         882
Name: aggression_score, dtype: int64






    



time: 21.3 ms



In [17]:

    
df['is_harassment_or_attack'].value_counts(dropna=False)









    Out[17]:





not_attack                                            1213696
recipient                                              150911
other                                                   40457
third_party                                             33592
recipient\nthird_party                                  10046
other\nnot_attack                                        9283
recipient\nnot_attack                                    6967
quoting                                                  6596
recipient\nthird_party\nquoting\nother\nnot_attack       5969
recipient\nother                                         4408
recipient\nthird_party\nquoting\nother                   2504
recipient\nthird_party\nnot_attack                       2496
third_party\nother                                       1906
recipient\nthird_party\nother                            1863
quoting\nnot_attack                                      1630
recipient\nthird_party\nquoting                          1606
recipient\nthird_party\nquoting\nnot_attack              1392
third_party\nnot_attack                                  1300
quoting\nother\nnot_attack                               1230
recipient\nother\nnot_attack                              830
quoting\nother                                            642
third_party\nquoting                                      610
recipient\nquoting                                        476
third_party\nquoting\nother                               361
recipient\nquoting\nnot_attack                            212
recipient\nquoting\nother                                 129
third_party\nquoting\nnot_attack                          117
third_party\nother\nnot_attack                             89
recipient\nthird_party\nother\nnot_attack                  66
NaN                                                        41
third_party\nquoting\nother\nnot_attack                    38
recipient\nquoting\nother\nnot_attack                      31
Name: is_harassment_or_attack, dtype: int64






    



time: 115 ms

Drop NAs in aggression_score or is_harassment_or_attack input



In [18]:

    
df = df.dropna(subset = ['aggression_score', 'is_harassment_or_attack'])
print('# annotations: ', df.shape[0])









    



# annotations:  1500571
time: 915 ms

Remove ambivalent is_harassment_or_attack annotations

An annotations is ambivalent if it was labeled as both an attack and not an attack



In [19]:

    
# remove all annotations from users who are ambivalent in 10% or more of revisions
# we consider these users unreliable
def ambivalent(s):
    return 'not_attack' in s and s!= 'not_attack'
df['ambivalent'] = df['is_harassment_or_attack'].apply(ambivalent)
non_ambivalent_workers = df.groupby('_worker_id', as_index = False)['ambivalent'].mean().query('ambivalent < 0.1')
df = df.merge(non_ambivalent_workers[['_worker_id']], how = 'inner', on = '_worker_id')
print('# annotations: ', df.shape[0])









    



# annotations:  1439146
time: 5.58 s



In [20]:

    
# remove all other ambivalent annotations
df = df.query('ambivalent==False')
print('# annotations: ', df.shape[0])









    



# annotations:  1434257
time: 2.58 s

Make sure that each rev was only annotated by the same worker once



In [21]:

    
df.groupby(['rev_id', '_worker_id']).size().value_counts()









    Out[21]:





1    1431503
2       1377
dtype: int64






    



time: 569 ms



In [22]:

    
df = df.drop_duplicates(subset = ['rev_id', '_worker_id'])
print('# annotations: ', df.shape[0])









    



# annotations:  1432880
time: 1.17 s

Filter out annotations for revisions with duplicated diff content



In [23]:

    
comments = df.drop_duplicates(subset = ['rev_id'])
print(comments.shape[0])









    



123633
time: 154 ms



In [24]:

    
u_comments = comments.drop_duplicates(subset = ['clean_diff'])
print(u_comments.shape[0])









    



120218
time: 195 ms



In [25]:

    
comments[comments.duplicated(subset = ['clean_diff'])].head(5)









    Out[25]:






  
    
      
      _aggression_score
      _channel
      _city
      _country
      _created_at
      _golden
      _id
      _ip
      _is_harassment_or_attack
      _missed
      ...
      user_id
      user_text
      not_attack
      other
      quoting
      recipient
      third_party
      attack
      aggression
      ambivalent
    
  
  
    
      825
      NaN
      neodev
      Belgrade
      SRB
      4/20/2016 14:37:26
      False
      1965035223
      109.92.158.251
      NaN
      NaN
      ...
      20335199.0
      Linkiscool99
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      False
    
    
      1316
      NaN
      clixsense
      Rio De Janeiro
      BRA
      5/25/2016 17:40:23
      False
      1999580751
      186.221.107.247
      NaN
      NaN
      ...
      9897.0
      Kwekubo
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      False
    
    
      1551
      NaN
      clixsense
      Rio De Janeiro
      BRA
      5/8/2016 13:47:38
      False
      1979100431
      186.221.148.47
      NaN
      NaN
      ...
      11496785.0
      M-m-moot
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      False
    
    
      3103
      NaN
      neodev
      Cairo
      EGY
      4/21/2016 10:51:20
      False
      1965872978
      197.44.120.129
      NaN
      NaN
      ...
      10928492.0
      Horse Manure Again
      0.0
      1.0
      0.0
      0.0
      0.0
      1.0
      1.0
      False
    
    
      3754
      NaN
      neodev
      Belgrade
      SRB
      5/8/2016 14:57:10
      False
      1979194169
      77.46.214.221
      NaN
      NaN
      ...
      16328760.0
      DavisJune
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      False
    
  

5 rows × 52 columns







    



time: 90.6 ms



In [26]:

    
df = df.merge(u_comments[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])









    



# annotations:  1395983
time: 2.96 s

Check that labels are not None



In [27]:

    
df['recipient'].value_counts(dropna=False)









    Out[27]:





0.0    1240903
1.0     155080
Name: recipient, dtype: int64






    



time: 18.8 ms



In [28]:

    
df['attack'].value_counts(dropna=False)









    Out[28]:





0.0    1163956
1.0     232027
Name: attack, dtype: int64






    



time: 14.2 ms



In [29]:

    
df['aggression'].value_counts(dropna=False)









    Out[29]:





0.0    1141434
1.0     254549
Name: aggression, dtype: int64






    



time: 15.3 ms

Remove annotations from all revisions that were annotated less than 8 times



In [30]:

    
counts = df['rev_id'].value_counts().to_frame()
counts.columns = ['n']
counts['rev_id'] = counts.index









    



time: 41.2 ms



In [31]:

    
counts.shape









    Out[31]:





(120218, 2)






    



time: 1.82 ms



In [32]:

    
counts['n'].value_counts().head()









    Out[32]:





10    56283
9     29208
8      7469
19     6907
20     6190
Name: n, dtype: int64






    



time: 3.79 ms



In [33]:

    
counts_enough = counts.query("n>=8")









    



time: 11.2 ms



In [34]:

    
counts_enough.shape









    Out[34]:





(116179, 2)






    



time: 2.07 ms



In [35]:

    
df = df.merge(counts_enough[['rev_id']], how = 'inner', on = 'rev_id')
print('# annotations: ', df.shape[0])









    



# annotations:  1368958
time: 1.05 s

Discard nuisance columns



In [50]:

    
df.columns









    Out[50]:





Index(['_aggression_score', '_channel', '_city', '_country', '_created_at',
       '_golden', '_id', '_ip', '_is_harassment_or_attack', '_missed', '_na',
       '_region', '_started_at', '_tainted', '_trust', '_unit_id',
       '_worker_id', 'aggression_score', 'aggression_score_gold',
       'aggression_score_gold_reason', 'block_actions', 'block_params',
       'block_reasons', 'block_timestamps', 'clean_diff', 'diff',
       'insert_only', 'is_harassment_or_attack',
       'is_harassment_or_attack_gold', 'is_harassment_or_attack_gold_reason',
       'na', 'na_gold', 'na_gold_reason', 'ns', 'orig__golden', 'page_id',
       'page_title', 'rev_comment', 'rev_id', 'rev_timestamp', 'sample', 'src',
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression', 'ambivalent'],
      dtype='object')






    



time: 2.36 ms



In [36]:

    
cols = ['rev_id', '_worker_id', 'ns', 'sample', 'src','clean_diff', 'diff', 'insert_only', 'page_id',
       'page_title', 'rev_comment', 'rev_timestamp', 
       'user_id', 'user_text', 'not_attack', 'other', 'quoting', 'recipient',
       'third_party', 'attack', 'aggression', 'aggression_score']
df = df[cols]









    



time: 1.59 s

Summary Stats



In [41]:

    
df.groupby(['ns', 'sample']).size()









    Out[41]:





ns       sample 
article  blocked    351106
         random     233073
user     blocked    534054
         random     250725
dtype: int64






    



time: 359 ms



In [42]:

    
df.to_csv('../../data/annotations/clean/annotations.tsv', index=False, sep='\t')









    



time: 44.6 s



In [43]:

    
pd.read_csv('../../data/annotations/clean/annotations.tsv', sep='\t').shape









    Out[43]:





(1368958, 22)






    



time: 14.8 s

	_aggression_score	_channel	_city	_country	_created_at	_golden	_id	_ip	_is_harassment_or_attack	_missed	...	user_id	user_text	not_attack	other	attack	aggression	ambivalent
825	NaN	neodev	Belgrade	SRB	4/20/2016 14:37:26	False	1965035223	109.92.158.251	NaN	NaN	...	20335199.0	Linkiscool99	1.0	0.0	0.0	0.0	False
1316	NaN	clixsense	Rio De Janeiro	BRA	5/25/2016 17:40:23	False	1999580751	186.221.107.247	NaN	NaN	...	9897.0	Kwekubo	1.0	0.0	0.0	0.0	False
1551	NaN	clixsense	Rio De Janeiro	BRA	5/8/2016 13:47:38	False	1979100431	186.221.148.47	NaN	NaN	...	11496785.0	M-m-moot	1.0	0.0	0.0	0.0	False
3103	NaN	neodev	Cairo	EGY	4/21/2016 10:51:20	False	1965872978	197.44.120.129	NaN	NaN	...	10928492.0	Horse Manure Again	0.0	1.0	1.0	1.0	False
3754	NaN	neodev	Belgrade	SRB	5/8/2016 14:57:10	False	1979194169	77.46.214.221	NaN	NaN	...	16328760.0	DavisJune	1.0	0.0	0.0	0.0	False